home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
- Brewster@think.com
- */
-
- /* Looks up words in the inverted file index.
- * Please pardon my novice C code.
- *
- * -brewster
- */
-
- /* Important functions:
- * run_search
- * search_for_words
- */
-
- /* to do:
- * handle the null request by answering something.
- * answer questions that are just "help" and "?"
- * Handle searches on multiple databases
- */
-
- /* changes 5.2.90 HWM
- - changed calls to perror() to calls to panic()
- - made print_best_hits() only print hits w/ non-zero weight
- - made random arrays static instead of reading them in.
- removed getRandomArray.
- - removed unused variables
- Brewster 7/90 made look_up_word_in_dictionary safer.
- Brewster 7/90 eliminated trailing <lf> on filename and headline table accesses
- HWM 7.12.90 - replaced all calls to panic with error code returns and a log
- file
- - added the routine initSearchEngine() which should be called
- before any other search routine
- - added beFriendly() to give other processes time under
- multifinder
- JG 5.31.91 - added relevance feedback for line fragments.
- JG 7.8.91 - added doc_id to search_for_words, removed scale_scores.
- */
-
- #define _search_c
-
- #include <ctype.h>
-
- #include <string.h> /* for strlen() */
- #ifdef THINK_C
- #include <unix.h> /* for sleep() */
- #endif /* think_c */
-
- #include "cutil.h"
- #include "irfiles.h"
- #include "irlex.h"
- #include "irext.h"
- #include "irsearch.h"
- #include "docid.h"
- #include <math.h>
-
- #define TEST_SEARCH false /* set to TRUE to allow printing to console */
-
- /*----------------------------------------------------------------------*/
-
- static Boolean calcDocLength _AP((hit* theHit,long* lines,long* bytes));
-
- static Boolean
- calcDocLength(theHit,lines,bytes)
- hit* theHit;
- long* lines;
- long* bytes;
- /* Given a hit, open the file and figure out how many bytes and lines
- it contains. This is not needed by the serial search engine (it
- stores these values in its dictionary. It is used by the dynamic
- help facility).
- */
- {
- *lines = theHit->number_of_lines;
-
- /* find the length of the document */
- if(theHit->end_character != 0)
- {
- /* document is not whole file, so size is stored */
- *bytes = theHit->end_character - theHit->start_character;
- return(true);
- }
- else
- {
- /* whole file, find file length from the file */
- FILE* file = NULL;
- if (((file = s_fopen(theHit->filename, "r")) != NULL) &&
- (s_fseek(file, 0L, SEEK_END) == 0) &&
- ((*bytes = ftell(file)) != -1))
- { s_fclose(file);
- return(true); /* we are done, bytes is set */
- }
- else
- { s_fclose(file);
- return(false); /* something went wrong with the file */
- }
- }
- }
-
-
-
-
- static long wordDelimiter _AP((long c));
-
- static long wordDelimiter(c)
- long c;
- /* decide if c is a delimiter or not */
- {
- if (isalnum((char)(c & 0xFF)))
- return(NOT_DELIMITER);
- else
- return(IS_DELIMITER);
- }
-
- boolean search_for_words(words, db, doc_id)
- char* words;
- /* break the string into words (delimited by non-alphanumerics)
- and repeatedly call
- search_for_word(). Note that the string is modified in the process!
- XXX could do something interesting to return feedback on which of the seedwords
- was most/least important
- Returns true if successful.
- */
- database *db;
- long doc_id;
- {
- char* word = NULL;
- /* printf("words: %s\n", words); */
- word = strtokf(words,wordDelimiter);
- while(word != NULL){
- long dictionary_value;
- /* trim the string if necessary */
- if(strlen(word) > MAX_WORD_LENGTH){
- word[MAX_WORD_LENGTH] = '\0';
- }
- dictionary_value = look_up_word_in_dictionary(string_downcase(word), db);
- if(dictionary_value > 0){
- if(0 != search_word(word, 0L, 0L, 1L, doc_id, dictionary_value, db))
- return(false);
- }
- word = strtokf(NULL,NULL);
- beFriendly();
- }
- return(true);
- }
-
- /* gets the next best hit from the search engine and fills in all the slots.
- If the document does not exist, then it gets another, etc.
- It returns 0 if successful */
- long next_best_hit(the_best_hit, db)
- hit *the_best_hit;
- database *db;
- {
- document_table_entry doc_entry;
- long ret_value;
- while(1){ /* keep going until we get a good document */
- if(0 != (ret_value = best_hit(&(the_best_hit->document_id), &(the_best_hit->weight))))
- return(ret_value);
- if(the_best_hit->weight <= 0) /* if we are out of good stuff, return */
- return(1);
- /* fill in the rest of the hit */
- if (read_document_table_entry(&doc_entry,
- the_best_hit->document_id,
- db)
- == true){
- the_best_hit->start_character = doc_entry.start_character;
- the_best_hit->end_character = doc_entry.end_character;
- the_best_hit->document_length = doc_entry.document_length;
- the_best_hit->number_of_lines = doc_entry.number_of_lines;
- sprintf(the_best_hit->date, "%d", doc_entry.date);
- read_filename_table_entry(doc_entry.filename_id,
- the_best_hit->filename,
- the_best_hit->type,
- NULL,
- db),
- strncpy(the_best_hit->headline,
- read_headline_table_entry(doc_entry.headline_id,db),
- MAX_HEADLINE_LEN);
- if(probe_file(the_best_hit->filename))
- return(0); /* we win */
- else /* we lose */
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Dangling File %s in database %s.",
- the_best_hit->filename,
- db->database_file);
- /*
- strncpy(the_best_hit->headline, "***Missing Document***: ",
- MAX_HEADLINE_LEN);
- strncat(the_best_hit->headline,
- read_headline_table_entry(doc_entry.headline_id,db),
- MAX_HEADLINE_LEN - strlen(the_best_hit->headline));
- return(0);
- */
- }
- }
- else {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Error reading doc_table_entry for database %s, docid: %ld",
- db->database_file,
- the_best_hit->document_id);
- }
- beFriendly();
- }
- }
-
- /*----------------------------------------------------------------------*/
-
- boolean run_search(aSearch, headers, diags, index_directory,
- seed_words_used, waisProtocolVersion, headerNum)
- SearchAPDU* aSearch;
- WAISDocumentHeader** headers; /* list of results */
- diagnosticRecord*** diags; /* list of diagnostics */
- char *index_directory;
- char **seed_words_used; /* called with enough space */
- long waisProtocolVersion;
- long *headerNum;
- /* runs a search on the inverted file index and returns false if it errors
- in such a way that it can not even make a diagnostic record
- (should not happen).
- It changes headers with the replies or makes a diagnostic record
- */
- {
- diagnosticRecord* diag = NULL;
- WAISSearch* wais_search = (WAISSearch*)aSearch->Query; /* for convenience */
- char* new_db_name = (aSearch->DatabaseNames == NULL) ?
- merge_pathnames(INFO_DATABASE_NAME, index_directory) :
- merge_pathnames(aSearch->DatabaseNames[0], index_directory);
- char* dbName = new_db_name;
- database* db;
- long maxRawScore;
- long normalScore;
- char* originName = NULL;
- long i;
- query_parameter_type parameters;
- boolean search_result;
-
- db = openDatabase(new_db_name, false, true);
- if (db == NULL)
- { char msg[MAX_FILENAME_LEN * 2];
- strncpy(msg,"The following database is not available: ",
- MAX_FILENAME_LEN);
- s_strncat(msg,new_db_name,MAX_FILENAME_LEN,MAX_FILENAME_LEN);
- diag = makeDiag(false,D_PermanentSystemError,msg);
- *diags = (diagnosticRecord **)s_realloc(*diags,(size_t)(sizeof(diagnosticRecord*) * 2));
- (*diags)[0] = diag;
- (*diags)[1] = NULL;
- return(false);
- }
-
- {
- DocObj** docs = NULL;
-
- /* read the query */
- docs = wais_search->Docs;
- if(docs != NULL) {
- if(docs[0] != NULL && docs[0]->Type != NULL) {
- long id = -1;
- if(strcmp(docs[0]->Type,"WAIS_NEXT") == 0)
- id = next_docid(anyToString(GetLocalID(docIDFromAny(docs[0]->DocumentID))),
- db);
- else if(strcmp(docs[0]->Type,"WAIS_PREV") == 0)
- id = previous_docid(anyToString(GetLocalID(docIDFromAny(docs[0]->DocumentID))),
- db);
- if (id > -1) {
- document_table_entry doc_entry;
- hit foo;
- long lines,length;
- DocID* theDocID = NULL;
- char local_id[MAX_FILENAME_LEN + 60]; /* filename, start, end */
- local_id[0] = '\0';
-
- if (read_document_table_entry(&doc_entry, id, db) == true) {
- foo.start_character = doc_entry.start_character;
- foo.end_character = doc_entry.end_character;
- foo.document_length = doc_entry.document_length;
- foo.number_of_lines = doc_entry.number_of_lines;
-
- read_filename_table_entry(doc_entry.filename_id,
- foo.filename,
- foo.type,
- NULL,
- db),
- strncpy(foo.headline,
- read_headline_table_entry(doc_entry.headline_id,db),
- MAX_HEADLINE_LEN);
- sprintf(foo.date, "%d", doc_entry.date);
- sprintf(local_id, "%ld %ld %s",
- doc_entry.start_character,
- doc_entry.end_character,
- foo.filename);
-
- if (calcDocLength(&(foo),&lines,&length))
- { /* this document is good, return it */
- char** type = NULL;
-
- if (waisProtocolVersion >= '2')
- { type = (char**)s_malloc((size_t)(sizeof(char*) * 2));
- type[0] = s_strdup(foo.type);
- type[1] = NULL;
- }
- else
- type = NULL;
-
- theDocID = makeDocID();
- theDocID->originalDatabase = stringToAny(dbName);
- theDocID->originalLocalID = stringToAny(local_id);
- headers[(*headerNum)++] =
- makeWAISDocumentHeader(anyFromDocID(theDocID),
- UNUSED,
- -1L,
- UNUSED,length,lines,
- type,
- s_strdup(dbName),
- s_strdup(foo.date),
- s_strdup(foo.headline),
- NULL);
- headers[*headerNum] = NULL;
- freeDocID(theDocID);
- return(true);
- }
- else
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "document <%ld %ld %s> skipped.",
- doc_entry.start_character,
- doc_entry.end_character,
- foo.filename);
- return(true);
- }
- }
-
- }
- }
- }
- }
- /* until seed_words_used is supported */
- strcpy(*seed_words_used, wais_search->SeedWords);
-
- /* note that the serial search engine does not do relevance feedback.
- As such, fed back doc-id's are ignored. In a real system, we might
- want to generate diagnostics if such an id was inappropriate for this
- database (of course the UI should intercept such requests in the first
- place - but...It has no way of knowing what a server can handle!)
- */
-
- parameters.max_hit_retrieved = wais_search->MaxDocumentsRetrieved;
- set_query_parameter(SET_MAX_RETRIEVED_MASK, ¶meters);
-
- search_result = false;
-
-
- #ifdef RELEVANCE_FEEDBACK
- #define MAX_TEXT_SIZE 10000 /* Maximume size of relevant text */
- {
- WAISDocumentText *doctext, *getData(), *getDocumentText();
- DocObj** docs = NULL;
- DocObj* doc = NULL;
-
- /* read the query */
- docs = wais_search->Docs;
- if(docs != NULL) {
- /* assemble the elements and construct a response */
- for (i = 0, doc = docs[i]; doc != NULL; doc = docs[++i])
- {
- if(doc->Type == NULL ||
- strcmp(doc->Type,"TEXT") == 0 ||
- doc->Type[0] == 0) {
-
- long errorCode;
- doctext = NULL;
-
- if (doc->ChunkCode == CT_line)
- doctext = getDocumentText(doc, dbName, &errorCode);
- else if ((doc->ChunkCode == CT_byte) ||
- (doc->ChunkCode == CT_document))
- doctext = getData(doc, dbName, &errorCode);
- if (doctext != NULL) {
- if(doctext->DocumentText->size > MAX_TEXT_SIZE)
- doctext->DocumentText->bytes[MAX_TEXT_SIZE] = 0;
- search_result |=
- search_for_words(doctext->DocumentText->bytes, db, 1);
- freeWAISDocumentText(doctext);
- }
- }
- }
- }
- }
- #endif /* RELEVANT_FEEDBACK */
-
- search_result |= search_for_words(wais_search->SeedWords, db, 0);
-
- if (search_result == true)
- { /* the search went ok */
- hit best_hit;
- originName = dbName;
-
- finished_search_word(db);
- for (i = 0; i < wais_search->MaxDocumentsRetrieved; i++){
- if(0 != next_best_hit(&best_hit, db))
- break; /* out of hits */
- if(i == 0)
- maxRawScore = best_hit.weight;
- if (best_hit.weight > 0){
- long lines,length;
- DocID* theDocID = NULL;
- char local_id[MAX_FILENAME_LEN + 60]; /* filename, start, end */
- local_id[0] = '\0';
-
- if (calcDocLength(&(best_hit),&lines,&length))
- { /* this document is good, return it */
- char** type = NULL;
- normalScore = (long)floor(
- (((double)best_hit.weight) /
- ((double)maxRawScore)) *
- (MAX_NORMAL_SCORE + 1));
- if (normalScore > MAX_NORMAL_SCORE)
- normalScore = MAX_NORMAL_SCORE;
-
- sprintf(local_id, "%ld %ld %s",
- best_hit.start_character,
- best_hit.end_character,
- best_hit.filename);
-
- if (waisProtocolVersion >= '2')
- { type = (char**)s_malloc((size_t)(sizeof(char*) * 2));
- type[0] = s_strdup(best_hit.type);
- type[1] = NULL;
- }
- else
- type = NULL;
- /*
- printf("header %ld out of %ld\n", *headerNum,
- wais_search->MaxDocumentsRetrieved);
- */
- theDocID = makeDocID();
- theDocID->originalDatabase = stringToAny(originName);
- theDocID->originalLocalID = stringToAny(local_id);
- headers[(*headerNum)++] =
- makeWAISDocumentHeader(anyFromDocID(theDocID),
- UNUSED,
- (long)normalScore,
- UNUSED,length,lines,
- type,
- s_strdup(originName),
- s_strdup(best_hit.date),
- s_strdup(best_hit.
- headline),
- NULL);
- headers[*headerNum] = NULL;
- freeDocID(theDocID);
- }
- else
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "document <%ld %ld %s> skipped.",
- best_hit.start_character,
- best_hit.end_character,
- best_hit.filename);
- return(true);
- }
- }
- }
- }
- else
- { /* something went awry in the search */
- diag = makeDiag(true,D_PermanentSystemError,
- "Serious error in server");
- *diags = (diagnosticRecord**)s_realloc(*diags,(size_t)(sizeof(diagnosticRecord*) * 2));
- (*diags)[0] = diag;
- (*diags)[1] = NULL;
- }
- finished_best_hit();
- /* free everything */
- closeDatabase(db);
- return(true);
- }
-